import rbpe 

# Sample inputs for the encode/decode round-trip check below: short and long
# English text, a CJK verse, and a Japanese sentence that includes emoji.
origin_texts = [
    # Short English phrase.
    "Learn about language model tokenization",
    # Longer English paragraph (split across lines only for readability; the
    # adjacent literals are concatenated into a single string).
    (
        "OpenAI's large language models process text using tokens, which are "
        "common sequences of characters found in a set of text. The models "
        "learn to understand the statistical relationships between these "
        "tokens, and excel at producing the next token in a sequence of tokens"
    ),
    # Chinese verse with full-width punctuation.
    (
        "春风拂面花满枝，流水潺潺鸟语时。"
        "山川如画心自远，岁月静好梦相依。"
    ),
    # Japanese sentence with corner brackets and emoji.
    "「今日も一日頑張りましょう！💪✨🌞」",
]

# Read the entire training corpus into memory as a single UTF-8 string.
# The path is relative, so it is resolved against the current working
# directory at run time.
with open('./data/train.txt', encoding='utf-8') as corpus_file:
    train_str = corpus_file.read()

# Build the tokenizer from the corpus text.
# NOTE(review): the second argument (100) is passed straight to
# rbpe.BpeTokenizer; presumably a vocabulary-size or merge-count setting --
# confirm against the rbpe API.
tokenizer = rbpe.BpeTokenizer(train_str, 100)

# Round-trip check: each sample is encoded to token ids and decoded again, and
# the decoded text must equal the original exactly.
for origin_text in origin_texts:
    ids = tokenizer.encode(origin_text)
    text = tokenizer.decode(ids)
    # Raise explicitly instead of using a bare `assert`: assert statements are
    # removed under `python -O`, which would silently skip the only check this
    # script performs. The message also shows which sample failed and how.
    if not (origin_text == text):
        raise AssertionError(
            f"round-trip mismatch: expected {origin_text!r}, got {text!r}"
        )
    print(text)
